/* ***** BEGIN LICENSE BLOCK *****
 * Version: RCSL 1.0/RPSL 1.0
 *
 * Portions Copyright (c) 1995-2002 RealNetworks, Inc. All Rights Reserved.
 *
 * The contents of this file, and the files included with this file, are
 * subject to the current version of the RealNetworks Public Source License
 * Version 1.0 (the "RPSL") available at
 * http://www.helixcommunity.org/content/rpsl unless you have licensed
 * the file under the RealNetworks Community Source License Version 1.0
 * (the "RCSL") available at http://www.helixcommunity.org/content/rcsl,
 * in which case the RCSL will apply. You may also obtain the license terms
 * directly from RealNetworks.  You may not use this file except in
 * compliance with the RPSL or, if you have a valid RCSL with RealNetworks
 * applicable to this file, the RCSL.  Please see the applicable RPSL or
 * RCSL for the rights, obligations and limitations governing use of the
 * contents of the file.
 *
 * This file is part of the Helix DNA Technology. RealNetworks is the
 * developer of the Original Code and owns the copyrights in the portions
 * it created.
 *
 * This file, and the files included with this file, is distributed and made
 * available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS
 * FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 *
 * Technology Compatibility Kit Test Suite(s) Location:
 *    http://www.helixcommunity.org/content/tck
 *
 * Contributor(s):
 *
 * ***** END LICENSE BLOCK ***** */

/**************************************************************************************
 * Fixed-point MP3 decoder
 * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com)
 * June 2003
 *
 * polyphase.c - final stage of subband transform (polyphase synthesis filter)
 *
 * This is the C reference version using __int64
 * Look in the appropriate subdirectories for optimized asm implementations
 *   (e.g. arm/asmpoly.s)
 **************************************************************************************/

#include "coder.h"
#include "assembly.h"

/* Output scaling constants for the polyphase filter.
 *
 * input to Polyphase = Q(DQ_FRACBITS_OUT-2), gain 2 bits in convolution
 *  we also have the implicit bias of 2^15 to add back, so net fraction bits =
 *    DQ_FRACBITS_OUT - 2 - 2 - 15
 *  (see comment on Dequantize() for more info)
 *
 * The 64-bit accumulators are first shifted right by (32 - CSHIFT) and the
 *  result is then shifted right by DEF_NFRACBITS inside ClipToShort().
 */
#define DEF_NFRACBITS   (DQ_FRACBITS_OUT - 2 - 2 - 15)
#define CSHIFT  12  /* coefficients have 12 leading sign bits for early-terminating multiplies */

/* Scale x down by fracBits and saturate the result to [-32768, 32767].
 *
 * No rounding is performed here: the caller is expected to have already
 * added the rounding constant (1 << (fracBits-1)) to x.
 * Relies on a 32-bit int and on >> of a negative int being an arithmetic
 * (sign-propagating) shift.
 */
static __inline short ClipToShort(int x, int fracBits)
{
    int scaled = x >> fracBits;
    int signMask = scaled >> 31;    /* 0 when non-negative, -1 when negative */

    /* the value fits in 16 bits iff bits 31..15 are all copies of the sign bit */
    if ((scaled >> 15) == signMask)
        return (short)scaled;

    /* out of range: 0 ^ 0x7fff = 32767, -1 ^ 0x7fff = -32768 */
    return (short)(signMask ^ 0x7fff);
}

/* Mono multiply-accumulate steps for tap pair x (x = 0..7).
 *
 * All three macros expand to a brace block (invoke them without a trailing
 * semicolon) and operate on variables of the expanding scope:
 *   coef  - coefficient read pointer, advanced past every value consumed
 *   vb1   - base of the current vbuf row
 *   vLo, vHi, c1, c2 - scratch
 *   sum1L, sum2L     - 64-bit accumulators updated through MADD64()
 */

/* MC0M: consume two coefficients; sum1L gets vb1[x]*c1 and vb1[23-x]*(-c2) */
#define MC0M(x) { \
    c1 = *coef++; \
    c2 = *coef++; \
    vLo = vb1[(x)]; \
    vHi = vb1[23 - (x)]; \
    sum1L = MADD64(sum1L, vLo,  c1); \
    sum1L = MADD64(sum1L, vHi, -c2); \
}

/* MC1M: consume one coefficient; sum1L gets vb1[x]*c1 */
#define MC1M(x) { \
    c1 = *coef++; \
    vLo = vb1[(x)]; \
    sum1L = MADD64(sum1L, vLo,  c1); \
}

/* MC2M: consume two coefficients and feed two accumulators at once:
 *   sum1L gets vLo*c1 and vHi*(-c2), sum2L gets vLo*c2 and vHi*c1
 */
#define MC2M(x) { \
    c1 = *coef++; \
    c2 = *coef++; \
    vLo = vb1[(x)]; \
    vHi = vb1[23 - (x)]; \
    sum1L = MADD64(sum1L, vLo,  c1); \
    sum2L = MADD64(sum2L, vLo,  c2); \
    sum1L = MADD64(sum1L, vHi, -c2); \
    sum2L = MADD64(sum2L, vHi,  c1); \
}

/**************************************************************************************
 * Function:    PolyphaseMono
 *
 * Description: filter one subband and produce 32 output PCM samples for one channel
 *
 * Inputs:      pointer to PCM output buffer
 *              number of "extra shifts" (vbuf format = Q(DQ_FRACBITS_OUT-2))
 *              pointer to start of vbuf (preserved from last call)
 *              start of filter coefficient table (in proper, shuffled order)
 *              no minimum number of guard bits is required for input vbuf
 *                (see additional scaling comments below)
 *
 * Outputs:     32 samples of one channel of decoded PCM data, (i.e. Q16.0)
 *
 * Return:      none
 *
 * TODO:        add 32-bit version for platforms where 64-bit mul-acc is not supported
 *                (note max filter gain - see polyCoef[] comments)
 **************************************************************************************/
static void PolyphaseMono(short* pcm, int* vbuf, const int* coefBase)
{
    int i;
    const int* coef;
    int* vb1;
    int vLo, vHi, c1, c2;
    Word64 sum1L, sum2L, rndVal;

    rndVal = (Word64)( 1 << (DEF_NFRACBITS - 1 + (32 - CSHIFT)) );

    /* special case, output sample 0 */
    coef = coefBase;
    vb1 = vbuf;
    sum1L = rndVal;

    MC0M(0)
    MC0M(1)
    MC0M(2)
    MC0M(3)
    MC0M(4)
    MC0M(5)
    MC0M(6)
    MC0M(7)

    *(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32 - CSHIFT)), DEF_NFRACBITS);

    /* special case, output sample 16 */
    coef = coefBase + 256;
    vb1 = vbuf + 64 * 16;
    sum1L = rndVal;

    MC1M(0)
    MC1M(1)
    MC1M(2)
    MC1M(3)
    MC1M(4)
    MC1M(5)
    MC1M(6)
    MC1M(7)

    *(pcm + 16) = ClipToShort((int)SAR64(sum1L, (32 - CSHIFT)), DEF_NFRACBITS);

    /* main convolution loop: sum1L = samples 1, 2, 3, ... 15   sum2L = samples 31, 30, ... 17 */
    coef = coefBase + 16;
    vb1 = vbuf + 64;
    pcm++;

    /* right now, the compiler creates bad asm from this... */
    for (i = 15; i > 0; i--) {
        sum1L = sum2L = rndVal;

        MC2M(0)
        MC2M(1)
        MC2M(2)
        MC2M(3)
        MC2M(4)
        MC2M(5)
        MC2M(6)
        MC2M(7)

        vb1 += 64;
        *(pcm)       = ClipToShort((int)SAR64(sum1L, (32 - CSHIFT)), DEF_NFRACBITS);
        *(pcm + 2 * i) = ClipToShort((int)SAR64(sum2L, (32 - CSHIFT)), DEF_NFRACBITS);
        pcm++;
    }
}

/* Stereo multiply-accumulate steps for tap pair x (x = 0..7).
 *
 * Same arithmetic as the mono macros, applied to two channels that share the
 * coefficients: the left channel is read at vb1[...] and the right channel
 * 32 ints further on, at vb1[32 + ...].
 * All three macros expand to a brace block (invoke them without a trailing
 * semicolon) and operate on variables of the expanding scope:
 *   coef  - coefficient read pointer, advanced past every value consumed
 *   vb1   - base of the current vbuf row
 *   vLo, vHi, c1, c2 - scratch
 *   sum1L, sum2L, sum1R, sum2R - 64-bit accumulators updated through MADD64()
 */

/* MC0S: consume two coefficients; sum1L/sum1R get vLo*c1 and vHi*(-c2) */
#define MC0S(x) { \
    c1 = *coef++; \
    c2 = *coef++; \
    vLo = vb1[(x)]; \
    vHi = vb1[23 - (x)]; \
    sum1L = MADD64(sum1L, vLo,  c1); \
    sum1L = MADD64(sum1L, vHi, -c2); \
    vLo = vb1[32 + (x)]; \
    vHi = vb1[32 + (23 - (x))]; \
    sum1R = MADD64(sum1R, vLo,  c1); \
    sum1R = MADD64(sum1R, vHi, -c2); \
}

/* MC1S: consume one coefficient; sum1L/sum1R get vLo*c1 */
#define MC1S(x) { \
    c1 = *coef++; \
    vLo = vb1[(x)]; \
    sum1L = MADD64(sum1L, vLo,  c1); \
    vLo = vb1[32 + (x)]; \
    sum1R = MADD64(sum1R, vLo,  c1); \
}

/* MC2S: consume two coefficients and feed two accumulators per channel:
 *   sum1x gets vLo*c1 and vHi*(-c2), sum2x gets vLo*c2 and vHi*c1
 */
#define MC2S(x) { \
    c1 = *coef++; \
    c2 = *coef++; \
    vLo = vb1[(x)]; \
    vHi = vb1[23 - (x)]; \
    sum1L = MADD64(sum1L, vLo,  c1); \
    sum2L = MADD64(sum2L, vLo,  c2); \
    sum1L = MADD64(sum1L, vHi, -c2); \
    sum2L = MADD64(sum2L, vHi,  c1); \
    vLo = vb1[32 + (x)]; \
    vHi = vb1[32 + (23 - (x))]; \
    sum1R = MADD64(sum1R, vLo,  c1); \
    sum2R = MADD64(sum2R, vLo,  c2); \
    sum1R = MADD64(sum1R, vHi, -c2); \
    sum2R = MADD64(sum2R, vHi,  c1); \
}

/**************************************************************************************
 * Function:    PolyphaseStereo
 *
 * Description: filter one subband and produce 32 output PCM samples for each channel
 *
 * Inputs:      pointer to PCM output buffer
 *              number of "extra shifts" (vbuf format = Q(DQ_FRACBITS_OUT-2))
 *              pointer to start of vbuf (preserved from last call)
 *              start of filter coefficient table (in proper, shuffled order)
 *              no minimum number of guard bits is required for input vbuf
 *                (see additional scaling comments below)
 *
 * Outputs:     32 samples of two channels of decoded PCM data, (i.e. Q16.0)
 *
 * Return:      none
 *
 * Notes:       interleaves PCM samples LRLRLR...
 *
 * TODO:        add 32-bit version for platforms where 64-bit mul-acc is not supported
 **************************************************************************************/
static void PolyphaseStereo(short* pcm, int* vbuf, const int* coefBase)
{
    int i;
    const int* coef;
    int* vb1;
    int vLo, vHi, c1, c2;
    Word64 sum1L, sum2L, sum1R, sum2R, rndVal;

    rndVal = (Word64)( 1 << (DEF_NFRACBITS - 1 + (32 - CSHIFT)) );

    /* special case, output sample 0 */
    coef = coefBase;
    vb1 = vbuf;
    sum1L = sum1R = rndVal;

    MC0S(0)
    MC0S(1)
    MC0S(2)
    MC0S(3)
    MC0S(4)
    MC0S(5)
    MC0S(6)
    MC0S(7)

    *(pcm + 0) = ClipToShort((int)SAR64(sum1L, (32 - CSHIFT)), DEF_NFRACBITS);
    *(pcm + 1) = ClipToShort((int)SAR64(sum1R, (32 - CSHIFT)), DEF_NFRACBITS);

    /* special case, output sample 16 */
    coef = coefBase + 256;
    vb1 = vbuf + 64 * 16;
    sum1L = sum1R = rndVal;

    MC1S(0)
    MC1S(1)
    MC1S(2)
    MC1S(3)
    MC1S(4)
    MC1S(5)
    MC1S(6)
    MC1S(7)

    *(pcm + 2 * 16 + 0) = ClipToShort((int)SAR64(sum1L, (32 - CSHIFT)), DEF_NFRACBITS);
    *(pcm + 2 * 16 + 1) = ClipToShort((int)SAR64(sum1R, (32 - CSHIFT)), DEF_NFRACBITS);

    /* main convolution loop: sum1L = samples 1, 2, 3, ... 15   sum2L = samples 31, 30, ... 17 */
    coef = coefBase + 16;
    vb1 = vbuf + 64;
    pcm += 2;

    /* right now, the compiler creates bad asm from this... */
    for (i = 15; i > 0; i--) {
        sum1L = sum2L = rndVal;
        sum1R = sum2R = rndVal;

        MC2S(0)
        MC2S(1)
        MC2S(2)
        MC2S(3)
        MC2S(4)
        MC2S(5)
        MC2S(6)
        MC2S(7)

        vb1 += 64;
        *(pcm + 0)         = ClipToShort((int)SAR64(sum1L, (32 - CSHIFT)), DEF_NFRACBITS);
        *(pcm + 1)         = ClipToShort((int)SAR64(sum1R, (32 - CSHIFT)), DEF_NFRACBITS);
        *(pcm + 2 * 2 * i + 0) = ClipToShort((int)SAR64(sum2L, (32 - CSHIFT)), DEF_NFRACBITS);
        *(pcm + 2 * 2 * i + 1) = ClipToShort((int)SAR64(sum2R, (32 - CSHIFT)), DEF_NFRACBITS);
        pcm += 2;
    }
}
